# rename_qss_dint_pdfs.py
# QSS & DINT Offline Renamer
# Matches saved HTML and PDF files for QSS/DINT issues
# - Extracts <meta name="citation_title">
# - Renames PDFs to clean article titles
# - Removes underscores, MIT Press, invalid characters
# - Logs all renames to CSV

"""
rename_qss_dint_pdfs.py

Offline renamer for QSS/DINT articles.
- Pairs the saved HTML and PDF files in one folder by save order (file modification time).
- Extracts <meta name="citation_title"> from each HTML.
- Renames corresponding PDF using the full title.
- Cleans underscores, removes 'MIT Press', strips newlines and invalid characters.
- Creates CSV log of renamed files.
"""


import os
import re
import csv
from bs4 import BeautifulSoup

# Upper bound on the length of the filename stem (the ".pdf" suffix is added
# by the caller and is not counted here).
MAX_FILENAME_LEN = 200


def sanitize(text: str) -> str:
    """Turn an article title into a clean filename stem.

    The title is processed in this order:

    1. Newlines and carriage returns become spaces.
    2. Characters that are invalid in filenames (``\\ / * ? : " < > |``)
       are dropped.
    3. Underscores become spaces.
    4. Whitespace followed by ``MIT Press`` and everything after it is removed.
    5. Runs of whitespace collapse to a single space and the ends are trimmed.
    6. The result is cut to ``MAX_FILENAME_LEN`` characters and trimmed again.

    Args:
        text: Raw title text, e.g. the ``content`` of a ``citation_title``
            meta tag.

    Returns:
        The cleaned stem without an extension. May be an empty string when
        nothing usable is left.
    """
    text = text.replace("\n", " ").replace("\r", " ")
    clean = re.sub(r'[\\/*?:"<>|]', "", text)
    clean = clean.replace("_", " ")
    clean = re.sub(r"\s+MIT Press.*$", "", clean)
    clean = re.sub(r"\s+", " ", clean).strip()
    # The cut can land right after a space; trim again so the stem never ends
    # in whitespace (which would produce names like "Title .pdf").
    return clean[:MAX_FILENAME_LEN].rstrip()

# --- Input folder ---
# Paths pasted from a file manager or shell often arrive wrapped in quotes,
# so strip those along with surrounding whitespace.
folder = input("Enter path to folder with HTML and PDF files: ").strip().strip("\"'")
if not os.path.isdir(folder):
    raise SystemExit(f"[ERROR] Not a folder: {folder}")


def _by_mtime(name):
    """Sort key: modification time, then name so that ties order deterministically."""
    return (os.path.getmtime(os.path.join(folder, name)), name)


# Collect HTML and PDF files sorted by modified time (one directory listing
# serves both so the two lists come from the same snapshot).
entries = os.listdir(folder)
html_files = sorted((f for f in entries if f.lower().endswith(".html")), key=_by_mtime)
pdf_files = sorted((f for f in entries if f.lower().endswith(".pdf")), key=_by_mtime)

if len(html_files) != len(pdf_files):
    print(f"[WARNING] Number of HTMLs ({len(html_files)}) and PDFs ({len(pdf_files)}) differ.")
    print("Make sure you saved HTML first then PDF for each article in sequence.")

log_path = os.path.join(folder, "renamed_pdfs_log.csv")
count = 0

# The context manager guarantees the log is flushed and closed even if a
# rename or an HTML read fails part-way through the run.
with open(log_path, "w", newline="", encoding="utf-8") as log_file:
    csv_writer = csv.writer(log_file)
    csv_writer.writerow(["Original PDF", "New Filename", "Extracted Title"])

    # The i-th HTML (by modification time) is paired with the i-th PDF.
    for html_name, pdf_name in zip(html_files, pdf_files):
        html_path = os.path.join(folder, html_name)
        pdf_path = os.path.join(folder, pdf_name)

        # Read as bytes so BeautifulSoup detects the page's encoding itself;
        # a saved page that is not UTF-8 would otherwise raise on decode.
        with open(html_path, "rb") as f:
            soup = BeautifulSoup(f, "html.parser")

        meta_title = soup.find("meta", {"name": "citation_title"})
        # .get() tolerates a citation_title tag without a content attribute.
        full_title = (meta_title.get("content") or "").strip() if meta_title else ""
        if not full_title:
            print(f"[SKIP] No citation_title in {html_name}")
            continue

        stem = sanitize(full_title)
        if not stem:
            # The title consisted only of characters that cannot be used in
            # a filename; renaming would yield a bare ".pdf".
            print(f"[SKIP] Title in {html_name} has no usable filename characters")
            continue

        new_name = stem + ".pdf"
        new_path = os.path.join(folder, new_name)

        # Add a numeric suffix while the target is taken by a *different*
        # file. A target that is this very PDF is not a collision.
        idx = 1
        while os.path.exists(new_path) and not os.path.samefile(pdf_path, new_path):
            new_name = f"{stem} {idx}.pdf"
            new_path = os.path.join(folder, new_name)
            idx += 1

        if new_name == pdf_name:
            # Already renamed by a previous run; leave it alone instead of
            # turning "Title.pdf" into "Title 1.pdf".
            print(f"[SKIP] {pdf_name} already has its final name")
            continue

        os.rename(pdf_path, new_path)
        csv_writer.writerow([pdf_name, new_name, full_title])
        count += 1
        print(f"[{count}] {pdf_name} -> {new_name}")

print(f"\nDone! {count} PDFs renamed.")
print(f"Log file saved as: {log_path}")
